Load packages

This example uses the tidyverse suite of packages.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(visdat)
library(ggbeeswarm)
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

Read data

Please download the final project data from Canvas. If this Rmarkdown file is located in the same directory as the downloaded CSV file, it will be able to load in the data for you. It is highly recommended that you use an RStudio RProject to more easily manage the working directory and file paths of the code and objects associated with the final project.

The code chunk below reads in the final project data.

df <- readr::read_csv("fall2022_finalproject.csv", col_names = TRUE)
## Rows: 1252 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): m
## dbl (10): x1, x2, x3, x4, v1, v2, v3, v4, v5, output
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

The readr::read_csv() function displays the data types and column names associated with the data. However, a glimpse is shown below that reveals the number of rows and also shows some of the representative values for the columns.

df %>% glimpse()
## Rows: 1,252
## Columns: 11
## $ x1     <dbl> 0.025878, 0.030768, 0.019325, 0.306212, 0.031296, 0.031073, 0.0…
## $ x2     <dbl> 0.255934, 0.261575, 0.020877, 0.033379, 0.259342, 0.027119, 0.0…
## $ x3     <dbl> 0.492830, 0.498460, 0.258360, 0.255385, 0.264387, 0.260915, 0.0…
## $ x4     <dbl> 0.012770, 0.055779, 0.012424, 0.056190, 0.056594, 0.055192, 0.0…
## $ v1     <dbl> 0.275651, 0.343204, 4.998508, 5.090153, 5.031107, 9.977407, 0.2…
## $ v2     <dbl> 0.033657, 0.027082, 0.030259, 0.052342, 0.517705, 0.532436, 1.0…
## $ v3     <dbl> 1.166214, 1.260579, 1.298285, 1.322005, 1.368195, 1.298797, 1.1…
## $ v4     <dbl> 0.408402, 0.664248, 0.412870, 0.652111, 0.533701, 0.857509, 0.6…
## $ v5     <dbl> 0.525226, 2.866343, 0.409007, 0.861594, 6.451933, 0.958574, 0.2…
## $ m      <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A"…
## $ output <dbl> 0.786, 0.730, 0.996, 0.326, 0.735, 0.954, 0.969, 0.986, 0.874, …

Logit transformation of the output variable.

df_reg<- df%>% 
  mutate(y = boot::logit(output)) %>% 
  select(x1:x4,v1:v5,m,y)%>%
  glimpse()
## Rows: 1,252
## Columns: 11
## $ x1 <dbl> 0.025878, 0.030768, 0.019325, 0.306212, 0.031296, 0.031073, 0.02440…
## $ x2 <dbl> 0.255934, 0.261575, 0.020877, 0.033379, 0.259342, 0.027119, 0.03183…
## $ x3 <dbl> 0.492830, 0.498460, 0.258360, 0.255385, 0.264387, 0.260915, 0.02205…
## $ x4 <dbl> 0.012770, 0.055779, 0.012424, 0.056190, 0.056594, 0.055192, 0.05575…
## $ v1 <dbl> 0.275651, 0.343204, 4.998508, 5.090153, 5.031107, 9.977407, 0.23012…
## $ v2 <dbl> 0.033657, 0.027082, 0.030259, 0.052342, 0.517705, 0.532436, 1.00521…
## $ v3 <dbl> 1.166214, 1.260579, 1.298285, 1.322005, 1.368195, 1.298797, 1.16544…
## $ v4 <dbl> 0.408402, 0.664248, 0.412870, 0.652111, 0.533701, 0.857509, 0.69071…
## $ v5 <dbl> 0.525226, 2.866343, 0.409007, 0.861594, 6.451933, 0.958574, 0.20876…
## $ m  <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A…
## $ y  <dbl> 1.3009808, 0.9946226, 5.5174529, -0.7263327, 1.0201407, 3.0320223, …

Binary classfication Data

df_classify<-df %>% 
  
  mutate(outcome = ifelse(output < 0.33, 'event', 'non_event'),
         outcome = factor(outcome, levels = c("event", "non_event"))) %>% 
         select(x1:x4,v1:v5,m,outcome)%>%
        
  glimpse()
## Rows: 1,252
## Columns: 11
## $ x1      <dbl> 0.025878, 0.030768, 0.019325, 0.306212, 0.031296, 0.031073, 0.…
## $ x2      <dbl> 0.255934, 0.261575, 0.020877, 0.033379, 0.259342, 0.027119, 0.…
## $ x3      <dbl> 0.492830, 0.498460, 0.258360, 0.255385, 0.264387, 0.260915, 0.…
## $ x4      <dbl> 0.012770, 0.055779, 0.012424, 0.056190, 0.056594, 0.055192, 0.…
## $ v1      <dbl> 0.275651, 0.343204, 4.998508, 5.090153, 5.031107, 9.977407, 0.…
## $ v2      <dbl> 0.033657, 0.027082, 0.030259, 0.052342, 0.517705, 0.532436, 1.…
## $ v3      <dbl> 1.166214, 1.260579, 1.298285, 1.322005, 1.368195, 1.298797, 1.…
## $ v4      <dbl> 0.408402, 0.664248, 0.412870, 0.652111, 0.533701, 0.857509, 0.…
## $ v5      <dbl> 0.525226, 2.866343, 0.409007, 0.861594, 6.451933, 0.958574, 0.…
## $ m       <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A…
## $ outcome <fct> non_event, non_event, non_event, event, non_event, non_event, …

Derived features Dataset Regression

df_derived_reg<-df %>% 
  mutate(x5 = 1 - (x1 + x2 + x3 + x4),
         w = x2 / (x3 + x4),
         z = (x1 + x2) / (x5 + x4),
         t = v1 * v2,
         y = boot::logit(output)) %>% 
         select(m,x5,t,w,z,y)%>%
        
  glimpse()
## Rows: 1,252
## Columns: 6
## $ m  <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A…
## $ x5 <dbl> 0.212588, 0.153418, 0.689014, 0.348834, 0.388381, 0.625701, 0.86595…
## $ t  <dbl> 0.009277586, 0.009294651, 0.151249854, 0.266428788, 2.604629249, 5.…
## $ w  <dbl> 0.50619858, 0.47195344, 0.07709835, 0.10712990, 0.80796683, 0.08579…
## $ z  <dbl> 1.25050808, 1.39745312, 0.05731369, 0.83844661, 0.65315580, 0.08546…
## $ y  <dbl> 1.3009808, 0.9946226, 5.5174529, -0.7263327, 1.0201407, 3.0320223, …

Derived features Dataset Classification

df_derived_classify<-df %>% 
  
  mutate(x5 = 1 - (x1 + x2 + x3 + x4),
         w = x2 / (x3 + x4),
         z = (x1 + x2) / (x5 + x4),
         t = v1 * v2,
         outcome = ifelse(output < 0.33, 'event', 'non_event'),
         outcome = factor(outcome, levels = c("event", "non_event"))) %>% 
         select(x5,w,z,t,m,outcome)%>%
        
  glimpse()
## Rows: 1,252
## Columns: 6
## $ x5      <dbl> 0.212588, 0.153418, 0.689014, 0.348834, 0.388381, 0.625701, 0.…
## $ w       <dbl> 0.50619858, 0.47195344, 0.07709835, 0.10712990, 0.80796683, 0.…
## $ z       <dbl> 1.25050808, 1.39745312, 0.05731369, 0.83844661, 0.65315580, 0.…
## $ t       <dbl> 0.009277586, 0.009294651, 0.151249854, 0.266428788, 2.60462924…
## $ m       <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A…
## $ outcome <fct> non_event, non_event, non_event, event, non_event, non_event, …

Attributes of the Data set

names(df)
##  [1] "x1"     "x2"     "x3"     "x4"     "v1"     "v2"     "v3"     "v4"    
##  [9] "v5"     "m"      "output"

The first five rows of the Data set

df[1:5,]
## # A tibble: 5 × 11
##       x1     x2    x3     x4    v1     v2    v3    v4    v5 m     output
##    <dbl>  <dbl> <dbl>  <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <chr>  <dbl>
## 1 0.0259 0.256  0.493 0.0128 0.276 0.0337  1.17 0.408 0.525 A      0.786
## 2 0.0308 0.262  0.498 0.0558 0.343 0.0271  1.26 0.664 2.87  A      0.73 
## 3 0.0193 0.0209 0.258 0.0124 5.00  0.0303  1.30 0.413 0.409 A      0.996
## 4 0.306  0.0334 0.255 0.0562 5.09  0.0523  1.32 0.652 0.862 A      0.326
## 5 0.0313 0.259  0.264 0.0566 5.03  0.518   1.37 0.534 6.45  A      0.735

The data have continuous inputs and a categorical input. The continuous inputs consist of two groups of variables, the “x-variables”, x1 through x4, and the “v-variables”, v1 through v5. The categorical input is m. The response is continuous and is named output.

Checking Missing Values

visdat::vis_miss(df)
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `gather()` instead.
## ℹ The deprecated feature was likely used in the visdat package.
##   Please report the issue at <]8;;https://github.com/ropensci/visdat/issueshttps://github.com/ropensci/visdat/issues]8;;>.

No Missing values present in the Data set

Check the data types visually.

visdat::vis_dat(df)

There is only one Categorical Column and rest are Continuous.

Check the number of unique values per variable.

df %>% purrr::map_dbl(n_distinct)
##     x1     x2     x3     x4     v1     v2     v3     v4     v5      m output 
##   1245   1250   1250   1235   1252   1249   1252   1252   1252      5    690

One of the variables have relatively few unique values! Checking the counts associated with each unique value for m.

df %>% count(m)
## # A tibble: 5 × 2
##   m         n
##   <chr> <int>
## 1 A       227
## 2 B       252
## 3 C       265
## 4 D       260
## 5 E       248

Graphically display counts with bar charts of the Categorical input . The m bar chart is below.

ggplot(df, aes(x=as.factor(m))) +
  geom_bar(color="black", fill="gold")+
  scale_y_continuous(expand=c(0.1,0.1)) +
  geom_text(stat='count',aes(label = ..count..), vjust = -1)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.

M values are uniform.

Check the number of unique combinations of all inputs.

df %>% 
  select(-output) %>% 
  distinct() %>% 
  dim()
## [1] 1252   10

No Duplicate Rows as there are 1252 rows of Data in the Data set

Classfication_Data Exploration

Bar plot of Classification Outcome

ggplot(df_classify, aes(x=as.factor(outcome))) +
  geom_bar(color="black", fill="blue")+
  scale_y_continuous(expand=c(0.1,0.1)) +
  geom_text(stat='count',aes(label = ..count..), vjust = -1)

Bar plot of Classification Outcome with input categorical value m.

df_classify%>%
  ggplot(mapping = aes(x = m, fill = outcome, label = ..count..))+
  geom_bar(width = 0.5)+
  stat_count(geom = "text", colour = "black", size = 3.5, ,position=position_stack(vjust=0.5))+
  theme_bw()+
  scale_fill_brewer(palette="Accent")

Visualize the combinations as a heatmap.

df_classify %>% 
  count(m, outcome) %>% 
  ggplot(mapping = aes(x = as.factor(m), y = as.factor(outcome))) +
  geom_tile(mapping = aes(fill = n), color = 'black') +
  geom_text(mapping = aes(label = n,
                          color = n > median(n)),
            size = 7) +
  scale_fill_viridis_c(guide = 'none', option = 'magma') +
  scale_color_manual(guide = 'none',
                     values = c("TRUE" = 'black',
                                "FALSE" = 'white')) +
  theme_bw()

Non event is more compared to event for the input value m with respect to Outcome.

Continuous Input Values EDA

Correlation between continuous output w.r.t continuous inputs

df_reg %>% select(x1,x2,x3,x4,v1,v2,v3,v4,v5 ,y) %>% cor() %>% corrplot::corrplot(type = 'upper')

Input Disturbutions

input_names <- c("x1", "x2", "x3", "x4", "v1","v2","v3","v5")

df_reg %>% 
  select(all_of(input_names)) %>% 
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid")) %>% 
  ggplot(mapping = aes(x = value)) +
  geom_histogram(bins = 35) +
  facet_wrap(~name, scales = "free") +
  theme_bw() +
  theme(axis.text.y = element_blank())

df_reg %>% 
  select(-v1,-v2,-v3,-v4,-v5) %>% 
  GGally::ggpairs(progress = FALSE,
                  diag = list(continuous = GGally::wrap('barDiag', bins=25))) +
  theme_bw()
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

df_reg %>% 
  select(-x1,-x2,-x3,x4) %>% 
  GGally::ggpairs(progress = FALSE,
                  diag = list(continuous = GGally::wrap('barDiag', bins=25))) +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

df_reg %>% 
  select(x1,x2,x3,x4,v1,v2,v3,v4,v5) %>% 
  cor() %>% 
  corrplot::corrplot(method = 'number', type = 'upper',order="hclust")

Derived input correlation

df_derived_reg %>% 
  select(x5,t,w,z) %>% 
  cor() %>% 
  corrplot::corrplot(method = 'number', type = 'upper',order="hclust")

#create pairs plot
pairs.panels(df_derived_reg)

pairs.panels(df_derived_classify)

Categorical with Continous Inputs with x inputs

df_reg  %>%
  pivot_longer(starts_with("x")) %>%
  ggplot(aes(x=value)) +
    geom_density(aes(color=m), bins=50) +
    facet_wrap(~name, scales="free", ncol=8) +
    ylab("") + xlab("")
## Warning in geom_density(aes(color = m), bins = 50): Ignoring unknown parameters:
## `bins`

All x values have a smiliar trend with respect to m.

Categorical with Continous Inputs with v variables

df_reg  %>%
    pivot_longer(starts_with("v")) %>%
    ggplot(aes(x=value)) +
    geom_density(aes(color=m), bins=50) +
    facet_wrap(~name, scales="free", ncol=8) +
    ylab("") + xlab("")
## Warning in geom_density(aes(color = m), bins = 50): Ignoring unknown parameters:
## `bins`

All v values have a smiliar trend with respect to m.

Output Visualization with Logit Transformation.

df%>% select(output) %>%
  ggplot() +
    geom_histogram(aes(x=output), bins=50)

After logit Transformation of y

df_reg%>% select(y) %>%
  ggplot() +
    geom_histogram(aes(x=y), bins=50)

Additional Scatter plot Visualization for the input features.

ggplot(df, aes(x = x1, y = output)) + 
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

ggplot(df_reg, aes(x = x1, y = y)) + 
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

ggplot(df_reg, aes(x = x2, y = y)) + 
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

ggplot(df_reg, aes(x = x3, y = y)) + 
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

ggplot(df_reg, aes(x = x4, y = y)) + 
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

ggplot(df_reg, aes(x = v1, y = y)) + 
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

ggplot(df_reg, aes(x = v2, y = y)) + 
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

ggplot(df_reg, aes(x = v3, y = y)) + 
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

ggplot(df_reg, aes(x = v4, y = y)) + 
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

ggplot(df_reg, aes(x = v5, y = y)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE, colour = "red")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(df_reg, aes(x = x2, y = y, colour = m)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth(
    aes(x = x2, y = y),
    method = "lm", se = FALSE, inherit.aes = FALSE,
    colour = "black", size = 1, linetype = "dashed"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'